*************************************************************
*This file cleans institution characteristics from the IPEDS*
*************************************************************

use "$clean_data_education/institutionDetails.dta", clear
merge 1:1 unitid yearGroup  using "$clean_data_education/tuitionFees.dta", keep(1 3) nogenerate
merge 1:1 unitid yearGroup  using "$clean_data_education/financialAid.dta", keep(1 3) keepusing(anyaidp avgAidAmount) nogenerate
	
*** control
qui bysort unitid: gen dup = cond(_N==1,0,_n)
	drop if dup == 0 & (control == -3 | control == 0)
replace control =. if control < 1 // missing recoded or not answered
sort unitid year  // try to get info from othe years when missing
by unitid: replace control = control[_n-1] if control ==.
gsort unitid -year
by unitid: replace control = control[_n-1] if control ==.
	label variable control "public control, private no-profit, private for profit"

gen public = 0
	replace public = 1 if control == 1
	label variable public "public control"
	label define public 0 "private (profit/no-profit)" 1 "public"

gen private = (control > 1 & !mi(control))

gen forProfit = 0
	replace forProfit = 1 if control == 3
	label variable forProfit "for-profit control"
	label define forProfit 0 "public or non-profit" 1"private for-profit" 

gen nonProfit = 0
	replace nonProfit = 1 if control == 2
	label variable nonProfit "non-profit control"
	label define nonProfit 0 "public or non-profit" 1"private for-profit" 


*** sector
replace sector = . if sector == 99 // missing recoded or not answered
sort unitid year  // try to get info from othe years when missing
by unitid: replace sector = sector[_n-1] if sector ==.
gsort unitid -year
by unitid: replace sector = sector[_n-1] if sector ==.
drop if sector == 0 // drop administrative unit only

gen communityCollege = (sector >= 4 & sector <= 6)
	label variable communityCollege "community college"
	label value communityCollege communityCollege

gen university = (sector >= 1 & sector <= 3) 


*top uni
* source: https://www.topuniversities.com/where-to-study/north-america/united-states/ranked-top-100-us-universities
* Top20
gen uniTop20 = 0
	replace uniTop20 = 1 if unitid == 166027 // harvard
	replace uniTop20 = 1 if unitid == 243744 // stanford
	replace uniTop20 = 1 if unitid == 166683 // MIT
	replace uniTop20 = 1 if unitid == 110635 // berkeley
	replace uniTop20 = 1 if unitid == 190150 // columbia
	replace uniTop20 = 1 if unitid == 110662 // UCLA
	replace uniTop20 = 1 if unitid == 130794 // yale 
	replace uniTop20 = 1 if unitid == 215062 // pennsylvania
	replace uniTop20 = 1 if unitid == 186131 // princeton
	replace uniTop20 = 1 if unitid == 190415 // cornell
	replace uniTop20 = 1 if unitid == 193900 // NYU
	replace uniTop20 = 1 if unitid == 144050 // chicago
	replace uniTop20 = 1 if unitid == 198419 // duke
	replace uniTop20 = 1 if unitid == 162928 // Johns Hopkins 
	replace uniTop20 = 1 if unitid == 123961 // uni Southern California
	replace uniTop20 = 1 if unitid == 147767 // northwesten
	replace uniTop20 = 1 if unitid == 211440 // carnegie mellon
	replace uniTop20 = 1 if unitid == 170976 // uni of michigan
	replace uniTop20 = 1 if unitid == 110404 // caltech
	replace uniTop20 = 1 if unitid == 217156 // brown
	label variable uniTop20 "top 20 ranking univeristies"

* Top30
gen uniTop30 = 0
	replace uniTop30 = 1 if uniTop20 == 1
	replace uniTop30 = 1 if unitid == 164988 // boston university
	replace uniTop30 = 1 if unitid == 227757 // rice university
	replace uniTop30 = 1 if unitid == 131496 // georgetown 
	replace uniTop30 = 1 if unitid == 236948 | unitid == 377555 | unitid == 377564 // uni of wash.
	replace uniTop30 = 1 if unitid == 228778 // uni of texas at austen
	replace uniTop30 = 1 if unitid == 110680 // uni of calif san diego
	replace uniTop30 = 1 if unitid == 139658 // emory uni
	replace uniTop30 = 1 if unitid == 110644 // uni of calif davis
	replace uniTop30 = 1 if unitid == 179867 // washington uni is st louis
	replace uniTop30 = 1 if unitid == 195030 // uni of rochester 
	label variable uniTop30 "top 30 ranking univeristies"

* Top50
gen uniTop50 = 0
	replace uniTop50 = 1 if uniTop20 == 1
	replace uniTop50 = 1 if uniTop30 == 1
	replace uniTop50 = 1 if unitid == 221999 // vanderbilt uni
	replace uniTop50 = 1 if unitid == 139755 // georgia institute of tech
	replace uniTop50 = 1 if unitid == 145637 // university of illinois at urbana
	replace uniTop50 = 1 if unitid == 131469 // george washington uni
	replace uniTop50 = 1 if unitid == 168148 // tufts uni
	replace uniTop50 = 1 if unitid == 134130 // university of florida
	replace uniTop50 = 1 if unitid == 182670 // dartmouth college
	replace uniTop50 = 1 if unitid == 199120 // university of north carolina, chapel hill
	replace uniTop50 = 1 if unitid == 135726 // university of miami
	replace uniTop50 = 1 if unitid == 152080 // university of notre dame
	replace uniTop50 = 1 if unitid == 186380 | unitid == 186399 | unitid == 186371 // rutgers uni
	replace uniTop50 = 1 if unitid == 110653 // university of california irvine
	replace uniTop50 = 1 if unitid == 201645 // case western reserve 
	replace uniTop50 = 1 if unitid == 145600 // university of illinois chicago
	replace uniTop50 = 1 if unitid == 196097 // stony brook university
	replace uniTop50 = 1 if unitid == 196130 // university at buffalo suny
	replace uniTop50 = 1 if unitid == 214777 // Pennsylvania State University main campus
	replace uniTop50 = 1 if unitid == 164924 // boston college
	replace uniTop50 = 1 if unitid == 163204 // university of maryland
	replace uniTop50 = 1 if unitid == 234076 // university of virginia
	label variable uniTop50 "top 50 ranking univeristies"


* financial aid
egen anyAid = mean(anyaidp) if forProfit == 0, by(unitid) // since this variable is only available from 2000, i take an average
	label variable anyAid "% of students with any financial aid (avg2000-2014)"
egen avgAmountAid = mean(avgAidAmount) if forProfit == 0, by(unitid) // since this variable is only available from 2000, i take an average
	label variable avgAmountAid "avg amount of financial aid (avg2000-2014)"
	drop anyaidp avgAidAmount 


* unmutable informations... keep in only the first year  
sort unitid year 
qui bysort unitid: gen dup2 = cond(_N==1,0,_n)
drop if dup2 > 1
drop dup dup2

rename yearGroup firstYear
	label variable firstYear "first year in the inst. details dataset"

save "$clean_data_education/temp.dta", replace 
********************************************************************
* weight: number of students
clear
use "$clean_data_education/fallEnrollmentTotalsReshaped.dta"
keep unitid year total1 total4
reshape wide total1 total4, i(unitid) j(year)
foreach i in 1990 1994 2000 2007 2014 {
	rename total1`i' students`i'
	rename total4`i' freshmenFullTime`i'
	label variable students`i' "number of students in `i'"
}


egen studentsAvg = rowmean(students1994 students2000 students2007)
	label variable studentsAvg "average # of students over 93-07"

egen freshmenFullTimeAvg = rowmean(freshmenFullTime1994 freshmenFullTime2000 freshmenFullTime2007)
	replace freshmenFullTimeAvg = studentsAvg if !mi(studentsAvg) & (freshmenFullTimeAvg == 0 | mi(freshmenFullTimeAvg))
	label variable freshmenFullTimeAvg "average # of full-time freshmen 93-07"

*Largest 150 universities
gsort -studentsAvg
gen id = _n
gen largest150 = (_n<=150)
gen largest100 = (_n<=100)
gen largest50 = (_n<=50)
	label variable largest150 "the largest 150 universities"
	drop id

	
	
merge 1:m unitid using "$clean_data_education/temp.dta", keep(2 3) nogenerate // 2 and 3 since 1098 are from 1991 but have no institution details while the rest is non-mainland 


keep unitid studentsAvg students???? instnm control sector avgCost freshmenFullTimeAvg university ///
public private communityCollege uniTop* largest* forProfit nonProfit firstYear anyAid avgAmountAid iclevel

save "$clean_data_education/temp2.dta", replace 
********************************************************************

* weight: number of degrees
use "$clean_data_education/completionsAwardedTotalsReshaped.dta", clear
keep unitid year total42 total40 total32 total31 total30 total22 total21 ///
total20 total12 total11 total10

egen awards = rsum (total42 total40 total32 total31 total30 total22 total21 ///
total20 total12 total11 total10)
keep unitid yearGroup awards

reshape wide awards, i(unitid) j(year)
foreach i in 1990 1994 2000 2007 2014 {
label variable awards`i' "number of students awarded in `i'"
}


merge 1:m unitid using "$clean_data_education/temp2.dta", keep(2 3) nogenerate // 2 and 3 since 1098 are from 1991 but have no institution details while the rest is non-mainland 
save "$clean_data_education/temp3.dta", replace 

 
foreach i in 1990 1994 2000 2007 2014 { 
gen students4Y`i' = 0
	replace students4Y`i' = students`i' if (sector >= 1 & sector <= 3) 
gen students2Y`i' = 0
	replace students2Y`i' = students`i' if (sector >= 4 & sector <= 6) 
gen studentsL2Y`i' = 0
	replace studentsL2Y`i' = students`i' if (sector >= 7 & sector <= 9) 
gen awards2Y`i' = 0
	replace awards2Y`i' = awards`i' if (sector >= 4 & sector <= 6) 
}


* Cleanings
merge 1:m unitid using "$clean_data_education/locations.dta", keep(3) keepusing(periods fips) nogenerate 
	drop if fips == 2 | fips == 15 | (fips >= 60&fips<=78) // dropping Alaska, Puerto Rico...etc

drop if forProfit == 1
drop if periods < 3
drop if freshmenFullTimeAvg < 50
qui bysort unitid: gen dup = cond(_N==1,0,_n)
	drop if dup > 1
	drop dup periods fips


save "$final_data_outcomes/IPEDS_institutionsDescriptivesMicro.dta", replace 

******** CZ
use "$clean_data_education/temp3.dta", clear

* Number of uni 
gen numberOfUni = 1 // this will summ up once you do the collapse at czone level
	label variable numberOfUni "number of uni "

* add locations
merge 1:m unitid using "$clean_data_education/locations.dta", keep(3) nogenerate
	drop if fips == 2 | fips == 15 | (fips >= 60&fips<=78) // dropping Alaska, Puerto Rico...etc

drop if forProfit == 1
drop if periods < 3
drop if freshmenFullTimeAvg < 50

foreach i in 1994 2007 {
	preserve
	keep if yearGroup == `i'
	**********************************************************
	* Run it all together, save labels and collapse
	foreach v of var * {
		local l`v' : variable label `v'
		if `"`l`v''"' == "" {
		local l`v' "`v'"
		}
	}
	order unitid czone instnm
	collapse (mean) anyAid (sum) awards`i' students`i' public private communityCollege university [iw=ourRatio], by(czone)

	foreach v of var * {
	label var `v' "`l`v''"
	}
	rename (public private communityCollege university) (public`i' private`i' communityCollege`i' university`i')
	save "$clean_data_education/temp`i'.dta", replace
	restore
}
	

* Majors' shares	
use "$clean_data_education/completionsMajorReshaped.dta", replace
merge 1:m unitid yearGroup using "$clean_data_education/locations.dta", keep(3) nogenerate
	drop if nonMainland == 1 | (yearG == 1990 & mi(czone)) // dropping uni in alaska and so on and dropping some obs in 1991 that have details for enrollment but not on institution details
merge m:1 unitid using "$clean_data_education/institutionControls.dta", keep(3) keepusing(forProfit public freshmenFullTimeAvg studentsAvg sector) nogenerate 
	
	drop if yearGroup == 1990 // comment this if you need the pct analysis
	drop if forProfit == 1
	drop if freshmenFullTimeAvg < 50
	drop if periods < 3
	keep if (sector >= 4 & sector <= 6)  // 2Y institutions
	
* new classification
egen totalNatural = rsum(total1 total2 total3 total26 total27 total40)
egen totalEngCom = rsum(total4 total10 total11 total14 total15 total41)
egen totalSocialS = rsum(total5 total9 total13 total19 total20 total42 total45)
egen totalBusEco = rsum(total6 total7 total8 total52)
egen totalHumArts = rsum(total12 total16 total22 total23 total24 total25 total30 total31 total38 total39 total50 total54)
egen totalManuf = rsum(total46 total47 total48 total49)
egen totalHealthS = rsum(total17 total18 total51)
egen totalPubMil = rsum(total28 total29 total43 total44)
egen totalMajors = rsum(totalNatural totalEngCom totalSocialS totalBusEco totalHumArts totalManuf totalHealthS totalPubMil)

keep unitid yearGroup czone fips total* ourRatio 

*** collapse 
foreach i of var total* {
replace `i' = `i' * ourRatio 
}


collapse (sum) total*, by(czone yearGroup)

* merge populations data
merge 1:1 czone yearGroup using "$clean_data_lmarket/census_PopulationDataCZ.dta", assert(2 3)
	drop if _merge == 2 // since we do not have all the CZs in IPEDS data 
	drop _merge


foreach i in Natural EngCom SocialS BusEco HumArts Manuf HealthS PubMil {
	gen share`i' = total`i'/totalMajors 
	}

drop if totalMajors == 0
keep yearGroup czone shareNatural shareEngCom shareSocialS shareBusEco shareHumArts shareManuf shareHealthS sharePubMil

foreach i in 1994 2007 {
	preserve
	keep if yearGroup == `i'
	rename share* share*`i'
	save "$clean_data_education/tempSh`i'.dta", replace
	restore
}
	use "$clean_data_education/temp1994.dta", clear
	merge 1:1 czone using "$clean_data_education/temp2007.dta", nogenerate
	merge 1:1 czone using "$clean_data_education/tempSh1994.dta", nogenerate
	merge 1:1 czone using "$clean_data_education/tempSh2007.dta", nogenerate

save "$final_data_outcomes/IPEDS_institutionsDescriptivesCZ.dta", replace 
drop anyAid yearGroup
rm "$clean_data_education/temp.dta"
rm "$clean_data_education/temp2.dta"
rm "$clean_data_education/temp3.dta"
rm "$clean_data_education/temp1994.dta"
rm "$clean_data_education/temp2007.dta"
rm "$clean_data_education/tempSh1994.dta"
rm "$clean_data_education/tempSh2007.dta"
